1. Collate the 2 excel files to have all the information at one place.¶

In [2]:
# All libraries/functions required

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
import plotly.express as px
from statsmodels.formula.api import ols
import statsmodels.api  as sm
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.linear_model import SGDRegressor, Ridge
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
#import xgboost as xgb
from sklearn.pipeline import Pipeline
#from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
In [3]:
address = 'D:/Capstone_Project/Healthcare_Datasets/'
In [4]:
hosp = pd.read_csv(address + 'Hospitalisation details.csv')
medic = pd.read_csv(address + 'Medical Examinations.csv')
names = pd.read_excel(address + 'Names.xlsx')

Data inspection using .info()¶

In [6]:
hosp.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2343 entries, 0 to 2342
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Customer ID    2343 non-null   object 
 1   year           2343 non-null   object 
 2   month          2343 non-null   object 
 3   date           2343 non-null   int64  
 4   children       2343 non-null   int64  
 5   charges        2343 non-null   float64
 6   Hospital tier  2343 non-null   object 
 7   City tier      2343 non-null   object 
 8   State ID       2343 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 164.9+ KB
In [7]:
medic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             2335 non-null   object 
 1   BMI                     2335 non-null   float64
 2   HBA1C                   2335 non-null   float64
 3   Heart Issues            2335 non-null   object 
 4   Any Transplants         2335 non-null   object 
 5   Cancer history          2335 non-null   object 
 6   NumberOfMajorSurgeries  2335 non-null   object 
 7   smoker                  2335 non-null   object 
dtypes: float64(2), object(6)
memory usage: 146.1+ KB
In [8]:
names.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Customer ID  2335 non-null   object
 1   name         2335 non-null   object
dtypes: object(2)
memory usage: 36.6+ KB
In [9]:
master_data = pd.merge(hosp, medic, how = 'inner', on = 'Customer ID')
In [10]:
master_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             2335 non-null   object 
 1   year                    2335 non-null   object 
 2   month                   2335 non-null   object 
 3   date                    2335 non-null   int64  
 4   children                2335 non-null   int64  
 5   charges                 2335 non-null   float64
 6   Hospital tier           2335 non-null   object 
 7   City tier               2335 non-null   object 
 8   State ID                2335 non-null   object 
 9   BMI                     2335 non-null   float64
 10  HBA1C                   2335 non-null   float64
 11  Heart Issues            2335 non-null   object 
 12  Any Transplants         2335 non-null   object 
 13  Cancer history          2335 non-null   object 
 14  NumberOfMajorSurgeries  2335 non-null   object 
 15  smoker                  2335 non-null   object 
dtypes: float64(3), int64(2), object(11)
memory usage: 292.0+ KB
In [11]:
master_data = master_data.merge(names,on='Customer ID')
In [12]:
master_data.head()
Out[12]:
Customer ID year month date children charges Hospital tier City tier State ID BMI HBA1C Heart Issues Any Transplants Cancer history NumberOfMajorSurgeries smoker name
0 Id2335 1992 Jul 9 0 563.84 tier - 2 tier - 3 R1013 17.58 4.51 No No No 1 No German, Mr. Aaron K
1 Id2334 1992 Nov 30 0 570.62 tier - 2 tier - 1 R1013 17.60 4.39 No No No 1 No Rosendahl, Mr. Evan P
2 Id2333 1993 Jun 30 0 600.00 tier - 2 tier - 1 R1013 16.47 6.35 No No Yes 1 No Albano, Ms. Julie
3 Id2332 1992 Sep 13 0 604.54 tier - 3 tier - 3 R1013 17.70 6.28 No No No 1 No Riveros Gonzalez, Mr. Juan D. Sr.
4 Id2331 1998 Jul 27 0 637.26 tier - 3 tier - 3 R1013 22.34 5.57 No No No 1 No Brietzke, Mr. Jordan
In [13]:
master_data.shape
Out[13]:
(2335, 17)

2. Check for missing values and duplicates before joining the 2 datasets.¶

In [15]:
master_data.isnull().sum()
Out[15]:
Customer ID               0
year                      0
month                     0
date                      0
children                  0
charges                   0
Hospital tier             0
City tier                 0
State ID                  0
BMI                       0
HBA1C                     0
Heart Issues              0
Any Transplants           0
Cancer history            0
NumberOfMajorSurgeries    0
smoker                    0
name                      0
dtype: int64
In [16]:
master_data['Hospital tier'].unique()
Out[16]:
array(['tier - 2', 'tier - 3', '?', 'tier - 1'], dtype=object)
In [17]:
master_data.columns
Out[17]:
Index(['Customer ID', 'year', 'month', 'date', 'children', 'charges',
       'Hospital tier', 'City tier', 'State ID', 'BMI', 'HBA1C',
       'Heart Issues', 'Any Transplants', 'Cancer history',
       'NumberOfMajorSurgeries', 'smoker', 'name'],
      dtype='object')
In [18]:
master_data.year.head(2)
Out[18]:
0    1992
1    1992
Name: year, dtype: object
In [19]:
master_data["Customer ID"].head()
Out[19]:
0    Id2335
1    Id2334
2    Id2333
3    Id2332
4    Id2331
Name: Customer ID, dtype: object
In [20]:
# Removing spaces from column names
master_data.columns = master_data.columns.str.lower() # making all column names as lower case
master_data.columns = master_data.columns.str.replace(' ', '_') # converted all space between column name as _(underscore)
master_data.columns
Out[20]:
Index(['customer_id', 'year', 'month', 'date', 'children', 'charges',
       'hospital_tier', 'city_tier', 'state_id', 'bmi', 'hba1c',
       'heart_issues', 'any_transplants', 'cancer_history',
       'numberofmajorsurgeries', 'smoker', 'name'],
      dtype='object')

3. Find the percentage of rows that have trivial value (for example, ?), and delete such rows if they do¶

not contain significant information

In [22]:
master_data == '?'
Out[22]:
customer_id year month date children charges hospital_tier city_tier state_id bmi hba1c heart_issues any_transplants cancer_history numberofmajorsurgeries smoker name
0 False False False False False False False False False False False False False False False False False
1 False False False False False False False False False False False False False False False False False
2 False False False False False False False False False False False False False False False False False
3 False False False False False False False False False False False False False False False False False
4 False False False False False False False False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2330 False False False False False False False False False False False False False False False False False
2331 False False False False False False False False False False False False False False False False False
2332 False False True False False False False False False False False False False False False False False
2333 False False False False False False False False False False False False False False False False False
2334 False False False False False False False False False False False False False False False False False

2335 rows × 17 columns

In [23]:
(master_data == '?').sum()
Out[23]:
customer_id               0
year                      2
month                     3
date                      0
children                  0
charges                   0
hospital_tier             1
city_tier                 1
state_id                  2
bmi                       0
hba1c                     0
heart_issues              0
any_transplants           0
cancer_history            0
numberofmajorsurgeries    0
smoker                    2
name                      0
dtype: int64
In [24]:
(master_data == '?').sum(axis = 1)
Out[24]:
0       0
1       0
2       0
3       0
4       0
       ..
2330    0
2331    0
2332    1
2333    0
2334    0
Length: 2335, dtype: int64
In [25]:
master_data.shape[1]
Out[25]:
17
In [26]:
miss_perc = (master_data == '?').sum(axis = 1)/master_data.shape[1] * 100
miss_perc
Out[26]:
0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
2330    0.000000
2331    0.000000
2332    5.882353
2333    0.000000
2334    0.000000
Length: 2335, dtype: float64
In [27]:
miss_perc[miss_perc > 0] # it will filter index with "?" value
Out[27]:
11       5.882353
13       5.882353
17      11.764706
542      5.882353
1046     5.882353
1049     5.882353
1700     5.882353
1775     5.882353
2165     5.882353
2332     5.882353
dtype: float64
In [28]:
miss_perc[miss_perc>0].index
Out[28]:
Index([11, 13, 17, 542, 1046, 1049, 1700, 1775, 2165, 2332], dtype='int64')
In [29]:
miss_perc_col = (master_data == '?').sum(axis = 0)/master_data.shape[0] * 100
miss_perc_col.sort_values(ascending= False)
Out[29]:
month                     0.128480
state_id                  0.085653
smoker                    0.085653
year                      0.085653
hospital_tier             0.042827
city_tier                 0.042827
heart_issues              0.000000
numberofmajorsurgeries    0.000000
cancer_history            0.000000
any_transplants           0.000000
customer_id               0.000000
hba1c                     0.000000
bmi                       0.000000
charges                   0.000000
children                  0.000000
date                      0.000000
name                      0.000000
dtype: float64
In [30]:
master_noq = master_data.drop(index = miss_perc[miss_perc>0].index)
master_noq.shape
Out[30]:
(2325, 17)
In [31]:
(master_noq == '?').sum() 
Out[31]:
customer_id               0
year                      0
month                     0
date                      0
children                  0
charges                   0
hospital_tier             0
city_tier                 0
state_id                  0
bmi                       0
hba1c                     0
heart_issues              0
any_transplants           0
cancer_history            0
numberofmajorsurgeries    0
smoker                    0
name                      0
dtype: int64

4. Use the necessary transformation methods to deal with the nominal and ordinal categorical¶

variables in the dataset

  1. nominal categorical = heart_issues any_transplants cancer_history smoker and state_id - generally dummy variables are created
  2. ordinal categorical = city tier and hospital tier - numbers are assigned to categories based on rank
In [34]:
master_noq[['city_tier', 'hospital_tier']]
Out[34]:
city_tier hospital_tier
0 tier - 3 tier - 2
1 tier - 1 tier - 2
2 tier - 1 tier - 2
3 tier - 3 tier - 3
4 tier - 3 tier - 3
... ... ...
2329 tier - 3 tier - 1
2330 tier - 2 tier - 1
2331 tier - 3 tier - 1
2333 tier - 3 tier - 2
2334 tier - 3 tier - 1

2325 rows × 2 columns

In [35]:
master_noq.state_id.value_counts()
Out[35]:
state_id
R1013    609
R1011    574
R1012    572
R1024    159
R1026     84
R1021     70
R1016     64
R1025     40
R1023     38
R1017     36
R1019     26
R1022     14
R1014     13
R1015     11
R1018      9
R1020      6
Name: count, dtype: int64
In [36]:
# Using ordinalencoder to deal with ordinal categorical variables - city tier and hospital tier

ordinal = OrdinalEncoder(categories= [['tier - 3', 'tier - 2', 'tier - 1'],['tier - 3', 'tier - 2', 'tier - 1']])
master_noq[['city_tier_ord','hospital_tier_ord']] = ordinal.fit_transform(master_noq[['city_tier', 'hospital_tier']])
In [37]:
master_noq.head()
Out[37]:
customer_id year month date children charges hospital_tier city_tier state_id bmi hba1c heart_issues any_transplants cancer_history numberofmajorsurgeries smoker name city_tier_ord hospital_tier_ord
0 Id2335 1992 Jul 9 0 563.84 tier - 2 tier - 3 R1013 17.58 4.51 No No No 1 No German, Mr. Aaron K 0.0 1.0
1 Id2334 1992 Nov 30 0 570.62 tier - 2 tier - 1 R1013 17.60 4.39 No No No 1 No Rosendahl, Mr. Evan P 2.0 1.0
2 Id2333 1993 Jun 30 0 600.00 tier - 2 tier - 1 R1013 16.47 6.35 No No Yes 1 No Albano, Ms. Julie 2.0 1.0
3 Id2332 1992 Sep 13 0 604.54 tier - 3 tier - 3 R1013 17.70 6.28 No No No 1 No Riveros Gonzalez, Mr. Juan D. Sr. 0.0 0.0
4 Id2331 1998 Jul 27 0 637.26 tier - 3 tier - 3 R1013 22.34 5.57 No No No 1 No Brietzke, Mr. Jordan 0.0 0.0
In [38]:
# --- Encode binary nominal columns (Yes/No) ---
#binary_cols = ['heart_issues', 'any_transplants', 'cancer_history']
#for col in binary_cols:
 #   master_noq[col] = master_noq[col].map({'No': 0, 'Yes': 1})
In [39]:
master_noq.info()
<class 'pandas.core.frame.DataFrame'>
Index: 2325 entries, 0 to 2334
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customer_id             2325 non-null   object 
 1   year                    2325 non-null   object 
 2   month                   2325 non-null   object 
 3   date                    2325 non-null   int64  
 4   children                2325 non-null   int64  
 5   charges                 2325 non-null   float64
 6   hospital_tier           2325 non-null   object 
 7   city_tier               2325 non-null   object 
 8   state_id                2325 non-null   object 
 9   bmi                     2325 non-null   float64
 10  hba1c                   2325 non-null   float64
 11  heart_issues            2325 non-null   object 
 12  any_transplants         2325 non-null   object 
 13  cancer_history          2325 non-null   object 
 14  numberofmajorsurgeries  2325 non-null   object 
 15  smoker                  2325 non-null   object 
 16  name                    2325 non-null   object 
 17  city_tier_ord           2325 non-null   float64
 18  hospital_tier_ord       2325 non-null   float64
dtypes: float64(5), int64(2), object(12)
memory usage: 363.3+ KB

5. The dataset has State ID, which has around 16 states. All states are not represented in equal proportions in the data. Creating dummy variables for all regions may also result in too many insignificant predictors. Nevertheless, only R1011, R1012, and R1013 are worth investigating further. Design a suitable strategy to create dummy variables with these restraints.¶

In [41]:
vc = master_noq.state_id.value_counts()   # frequency of each category
vc[:3].index       
Out[41]:
Index(['R1013', 'R1011', 'R1012'], dtype='object', name='state_id')
In [42]:
for i in vc[:3].index:
    var_name = 'state_id_' +i   # create name for the dummy varible
    print(var_name)             
    master_noq[var_name] = 0    # giving a dummy value 0 to dummy variable
    master_noq.loc[master_noq.state_id == i,var_name] = 1  # replacing 0 by 1 where state id is equal to category of the dummy variable
state_id_R1013
state_id_R1011
state_id_R1012
In [43]:
master_noq.head()
Out[43]:
customer_id year month date children charges hospital_tier city_tier state_id bmi ... any_transplants cancer_history numberofmajorsurgeries smoker name city_tier_ord hospital_tier_ord state_id_R1013 state_id_R1011 state_id_R1012
0 Id2335 1992 Jul 9 0 563.84 tier - 2 tier - 3 R1013 17.58 ... No No 1 No German, Mr. Aaron K 0.0 1.0 1 0 0
1 Id2334 1992 Nov 30 0 570.62 tier - 2 tier - 1 R1013 17.60 ... No No 1 No Rosendahl, Mr. Evan P 2.0 1.0 1 0 0
2 Id2333 1993 Jun 30 0 600.00 tier - 2 tier - 1 R1013 16.47 ... No Yes 1 No Albano, Ms. Julie 2.0 1.0 1 0 0
3 Id2332 1992 Sep 13 0 604.54 tier - 3 tier - 3 R1013 17.70 ... No No 1 No Riveros Gonzalez, Mr. Juan D. Sr. 0.0 0.0 1 0 0
4 Id2331 1998 Jul 27 0 637.26 tier - 3 tier - 3 R1013 22.34 ... No No 1 No Brietzke, Mr. Jordan 0.0 0.0 1 0 0

5 rows × 22 columns

In [44]:
master_noq.state_id.value_counts()
Out[44]:
state_id
R1013    609
R1011    574
R1012    572
R1024    159
R1026     84
R1021     70
R1016     64
R1025     40
R1023     38
R1017     36
R1019     26
R1022     14
R1014     13
R1015     11
R1018      9
R1020      6
Name: count, dtype: int64
In [45]:
# checking the no of records corresponding to R1013

master_noq['state_id_R1011'].value_counts()
Out[45]:
state_id_R1011
0    1751
1     574
Name: count, dtype: int64

6.The variable NumberOfMajorSurgeries also appears to have string values. Apply a suitable method to clean up this variable.¶

In [47]:
master_noq.numberofmajorsurgeries.unique()
Out[47]:
array(['1', 'No major surgery', '2', '3'], dtype=object)
In [48]:
master_noq.loc[master_noq.numberofmajorsurgeries == 'No major surgery','numberofmajorsurgeries' ] = 0
master_noq.numberofmajorsurgeries = master_noq.numberofmajorsurgeries.astype(int)
In [49]:
master_noq.numberofmajorsurgeries.unique()
Out[49]:
array([1, 0, 2, 3])

7. Age appears to be a significant factor in this analysis. Calculate the patients' ages based on their dates of birth.¶

In [51]:
master_noq.year = master_noq.year.astype(int)
In [52]:
master_noq['age'] = 2025 - master_noq.year
In [53]:
master_noq.head()
Out[53]:
customer_id year month date children charges hospital_tier city_tier state_id bmi ... cancer_history numberofmajorsurgeries smoker name city_tier_ord hospital_tier_ord state_id_R1013 state_id_R1011 state_id_R1012 age
0 Id2335 1992 Jul 9 0 563.84 tier - 2 tier - 3 R1013 17.58 ... No 1 No German, Mr. Aaron K 0.0 1.0 1 0 0 33
1 Id2334 1992 Nov 30 0 570.62 tier - 2 tier - 1 R1013 17.60 ... No 1 No Rosendahl, Mr. Evan P 2.0 1.0 1 0 0 33
2 Id2333 1993 Jun 30 0 600.00 tier - 2 tier - 1 R1013 16.47 ... Yes 1 No Albano, Ms. Julie 2.0 1.0 1 0 0 32
3 Id2332 1992 Sep 13 0 604.54 tier - 3 tier - 3 R1013 17.70 ... No 1 No Riveros Gonzalez, Mr. Juan D. Sr. 0.0 0.0 1 0 0 33
4 Id2331 1998 Jul 27 0 637.26 tier - 3 tier - 3 R1013 22.34 ... No 1 No Brietzke, Mr. Jordan 0.0 0.0 1 0 0 27

5 rows × 23 columns

8. The gender of the patient may be an important factor in determining the cost of hospitalization. The salutations in a beneficiary's name can be used to determine their gender. Make a new field for the beneficiary's gender.¶

In [55]:
master_noq.name
Out[55]:
0                     German, Mr.  Aaron K
1                   Rosendahl, Mr.  Evan P
2                       Albano, Ms.  Julie
3       Riveros Gonzalez, Mr.  Juan D. Sr.
4                    Brietzke, Mr.  Jordan
                       ...                
2329                Baker, Mr.  Russell B.
2330                  Kadala, Ms.  Kristyn
2331                  Osborne, Ms.  Kelsey
2333                Lehner, Mr.  Matthew D
2334                     Hawks, Ms.  Kelly
Name: name, Length: 2325, dtype: object
In [56]:
master_noq['title'] = master_noq.name.str.split('[,.]').str[1].str.strip()
In [57]:
master_noq.title.value_counts()
Out[57]:
title
Mr     1160
Ms     1023
Mrs     142
Name: count, dtype: int64
In [58]:
master_noq['gender'] = 'female'
master_noq.loc[master_noq.title == 'Mr', 'gender'] = 'male'
In [59]:
master_noq['gender'].value_counts()
Out[59]:
gender
female    1165
male      1160
Name: count, dtype: int64

9. You should also visualize the distribution of costs using a histogram, box and whisker plot, and swarm plot.¶

In [61]:
plt.figure(figsize = (25,10))
grid = plt.GridSpec(2, 2, wspace=0.4, hspace=0.3)
plt.subplot(grid[0, 0])
plt.hist(master_noq.charges, bins = 50)
plt.subplot(grid[0, 1])
master_noq.charges.plot.kde()
plt.subplot(grid[1, :])
plt.boxplot(master_noq.charges, vert = False)
plt.show()
No description has been provided for this image

10. State how the distribution is different across gender and tiers of hospitals¶

In [63]:
plt.figure(figsize = (15,5))
sns.boxplot(x = "charges",y = "hospital_tier", data = master_noq)
plt.show()
No description has been provided for this image

11. Create a radar chart to showcase the median hospitalization cost for each tier of hospitals¶

In [65]:
master_noq.groupby('hospital_tier')[['charges']].median()
Out[65]:
charges
hospital_tier
tier - 1 32097.435
tier - 2 7168.760
tier - 3 10676.830
In [66]:
median = master_noq.groupby('hospital_tier')[['charges']].median().reset_index()
median
Out[66]:
hospital_tier charges
0 tier - 1 32097.435
1 tier - 2 7168.760
2 tier - 3 10676.830
In [67]:
fig = px.line_polar(median, r='charges', theta='hospital_tier') #, line_close=True
fig.show()

12. Create a frequency table and a stacked bar chart to visualize the count of people in the different tiers of cities and hospitals¶

In [69]:
pd.crosstab(master_noq.city_tier, master_noq.hospital_tier)
Out[69]:
hospital_tier tier - 1 tier - 2 tier - 3
city_tier
tier - 1 85 403 241
tier - 2 106 479 222
tier - 3 109 452 228
In [70]:
pd.crosstab(master_noq.city_tier, master_noq.hospital_tier).plot.bar(stacked = True)
plt.show()
No description has been provided for this image

13. Test the following null hypotheses:¶

- Average hospitalization cost across the 3 types of hospitals is not significantly different  
- Average hospitalization cost across the 3 types of cities is not significantly different 
- Average hospitalization cost for smokers is not significantly different than non-smokers
- Smoking and Hearth issues are independent

H0 : Average hospitalization cost across the 3 types of hospitals is not significantly different¶

In [73]:
ols('charges ~ hospital_tier', data = master_noq).fit()
Out[73]:
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1ed9c759dc0>
In [74]:
mod = ols('charges ~ hospital_tier', data = master_noq).fit()
res = sm.stats.anova_lm(mod)
res
Out[74]:
df sum_sq mean_sq F PR(>F)
hospital_tier 2.0 9.763011e+10 4.881505e+10 493.989566 1.773822e-179
Residual 2322.0 2.294554e+11 9.881799e+07 NaN NaN

Looking at the p_value and further analysis, we can reject the null hypothesis and conclude that

Average hospitalization costs across the 3 types of hospitals are significantly different

H0 = Average hospitalization cost across the 3 types of cities is not significantly different¶

In [77]:
mod = ols('charges ~ city_tier', data = master_noq).fit()
res = sm.stats.anova_lm(mod)
res
Out[77]:
df sum_sq mean_sq F PR(>F)
city_tier 2.0 4.092192e+08 2.046096e+08 1.454356 0.233763
Residual 2322.0 3.266763e+11 1.406874e+08 NaN NaN

Looking at the p_value and further analysis, we fail to reject the null hypothesis and conclude that

Average hospitalization cost across the 3 types of cities is not significantly different

H0: Average hospitalization cost for smokers is not significantly different than non-smokers¶

In [80]:
sample1 = master_noq.loc[master_noq.smoker == 'yes', 'charges']
sample2 = master_noq.loc[master_noq.smoker != 'yes', 'charges']
stats.ttest_ind(sample1, sample2)
Out[80]:
TtestResult(statistic=74.15560699695726, pvalue=0.0, df=2323.0)

Looking at the p_value, we can reject the null hypothesis and conclude that

Average hospitalization cost for smokers is significantly different than non-smokers

H0 : Smoking and Heart issues are independent4¶

In [83]:
observed_table = pd.crosstab(master_noq.smoker, master_data.heart_issues)
observed_table
Out[83]:
heart_issues No yes
smoker
No 1108 731
yes 297 189
In [84]:
chi, p, df, expected = stats.chi2_contingency(observed_table)
In [85]:
chi, p, df, expected 
Out[85]:
(0.08588150449910657,
 0.7694797581780767,
 1,
 array([[1111.30967742,  727.69032258],
        [ 293.69032258,  192.30967742]]))

Looking at the p_value, we fail to reject the null hypothesis and conclude that

Smoking and Heart issues are independent

Machine Learning¶

1. Examine the correlation between predictors to identify highly correlated predictors¶

In [89]:
data = master_noq.drop(columns = ['customer_id','name', 'year', 'month', 'date','hospital_tier',
       'city_tier', 'state_id' , 'title'])
In [90]:
corr_plot = data.select_dtypes(exclude='object').corr()
ma = np.ones_like(corr_plot)
ma[np.tril_indices_from(ma)] = 0
In [91]:
plt.figure(figsize = (18,5))
sns.heatmap(corr_plot, annot= True , mask = ma, cmap='PuRd')
plt.show()
No description has been provided for this image

2. Develop a regression model Linear or Ridge. Evaluate the model with k-fold cross validation.¶

Also, ensure that you apply all the following suggestions:

  1. Implement the stratified 5-fold cross validation technique for both model building and

validation 2. Utilize effective standardization techniques and hyperparameter tuning 3. Incorporate sklearn-pipelines to streamline the workflow 4. Apply appropriate regularization techniques to address the bias-variance trade-off 5. Create five folds in the data, and introduce a variable to identify the folds 6. Develop Gradient Boost model and determine the variable importance scores, and identify the redundant variables

In [93]:
data_2 = pd.get_dummies(data, drop_first=True)
data_2.reset_index(drop=True, inplace = True)
In [94]:
data_2.head()
Out[94]:
children charges bmi hba1c numberofmajorsurgeries city_tier_ord hospital_tier_ord state_id_R1013 state_id_R1011 state_id_R1012 age heart_issues_yes any_transplants_yes cancer_history_Yes smoker_yes gender_male
0 0 563.84 17.58 4.51 1 0.0 1.0 1 0 0 33 False False False False True
1 0 570.62 17.60 4.39 1 2.0 1.0 1 0 0 33 False False False False True
2 0 600.00 16.47 6.35 1 2.0 1.0 1 0 0 32 False False True False False
3 0 604.54 17.70 6.28 1 0.0 0.0 1 0 0 33 False False False False True
4 0 637.26 22.34 5.57 1 0.0 0.0 1 0 0 27 False False False False True
In [95]:
# rearrange data to put 'charges' as first column or last
model_data = data_2.drop(columns = 'charges')
model_data.head()
model_data['charges'] = data_2.charges
model_data.head()
Out[95]:
children bmi hba1c numberofmajorsurgeries city_tier_ord hospital_tier_ord state_id_R1013 state_id_R1011 state_id_R1012 age heart_issues_yes any_transplants_yes cancer_history_Yes smoker_yes gender_male charges
0 0 17.58 4.51 1 0.0 1.0 1 0 0 33 False False False False True 563.84
1 0 17.60 4.39 1 2.0 1.0 1 0 0 33 False False False False True 570.62
2 0 16.47 6.35 1 2.0 1.0 1 0 0 32 False False True False False 600.00
3 0 17.70 6.28 1 0.0 0.0 1 0 0 33 False False False False True 604.54
4 0 22.34 5.57 1 0.0 0.0 1 0 0 27 False False False False True 637.26
In [96]:
model_data.columns = model_data.columns.str.lower()
In [97]:
model_data.columns
Out[97]:
Index(['children', 'bmi', 'hba1c', 'numberofmajorsurgeries', 'city_tier_ord',
       'hospital_tier_ord', 'state_id_r1013', 'state_id_r1011',
       'state_id_r1012', 'age', 'heart_issues_yes', 'any_transplants_yes',
       'cancer_history_yes', 'smoker_yes', 'gender_male', 'charges'],
      dtype='object')
In [170]:
# converting y to categorical for stratified k fold
y = model_data['charges']
X = model_data.drop(columns = 'charges')
In [172]:
X.head()
Out[172]:
children bmi hba1c numberofmajorsurgeries city_tier_ord hospital_tier_ord state_id_r1013 state_id_r1011 state_id_r1012 age heart_issues_yes any_transplants_yes cancer_history_yes smoker_yes gender_male
0 0 17.58 4.51 1 0.0 1.0 1 0 0 33 False False False False True
1 0 17.60 4.39 1 2.0 1.0 1 0 0 33 False False False False True
2 0 16.47 6.35 1 2.0 1.0 1 0 0 32 False False True False False
3 0 17.70 6.28 1 0.0 0.0 1 0 0 33 False False False False True
4 0 22.34 5.57 1 0.0 0.0 1 0 0 27 False False False False True
In [174]:
#Setting up a pipeline
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge())])
In [176]:
# Defining the parameters for hyperparameter tuning
parameters = {'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
In [178]:
# Creating the KFold object
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
In [180]:
# Creating the grid search object
model_ridge = GridSearchCV(pipeline, parameters, cv=kfold, scoring='neg_mean_squared_error')
In [182]:
model_ridge.fit(X, y)
Out[182]:
GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('regressor', Ridge())]),
             param_grid={'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
             scoring='neg_mean_squared_error')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('regressor', Ridge())]),
             param_grid={'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
             scoring='neg_mean_squared_error')
Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge())])
StandardScaler()
Ridge()
In [184]:
# Getting the best parameters and the best model
model_ridge.best_params_
Out[184]:
{'regressor__alpha': 10}
In [186]:
model_ridge.best_estimator_
Out[186]:
Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge(alpha=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()), ('regressor', Ridge(alpha=10))])
StandardScaler()
Ridge(alpha=10)

Gradient Boosting Algorithm¶

In [189]:
from sklearn.ensemble import GradientBoostingRegressor
In [191]:
# Assuming df is your DataFrame
# Use df appropriately to prepare X (input) and y (output)

# Split the data into training and testing sets
# (Make sure to replace X and y with your data appropriately)
X_train,X_test,y_train,y_test = train_test_split(X,y)
# Train the XGBoost model
model = GradientBoostingRegressor()
model.fit(X_train, y_train)


# You can print the feature importances if needed
print(model.feature_importances_)

# Identify redundant variables based on the importance scores
[3.53372903e-03 1.17702556e-01 4.91084867e-03 4.40653774e-04
 9.86447177e-05 2.20557989e-02 4.27397951e-03 5.70459090e-03
 3.89516644e-04 9.22638990e-02 2.24672298e-04 1.96152465e-05
 5.01954928e-05 7.48157095e-01 1.74204589e-04]
In [193]:
pd.DataFrame({'Features':model.feature_names_in_,'Importance':model.feature_importances_}).sort_values("Importance",ascending=False)
Out[193]:
Features Importance
13 smoker_yes 0.748157
1 bmi 0.117703
9 age 0.092264
5 hospital_tier_ord 0.022056
7 state_id_r1011 0.005705
2 hba1c 0.004911
6 state_id_r1013 0.004274
0 children 0.003534
3 numberofmajorsurgeries 0.000441
8 state_id_r1012 0.000390
10 heart_issues_yes 0.000225
14 gender_male 0.000174
4 city_tier_ord 0.000099
12 cancer_history_yes 0.000050
11 any_transplants_yes 0.000020
In [195]:
# train score
model.score(X_train,y_train)
Out[195]:
0.9412334345222239
In [197]:
# test score
model.score(X_test,y_test)
Out[197]:
0.8935237189576851

3. Case scenario: Estimate the cost of hospitalization for Christopher, Ms. Jayna (Date of birth 12/28/1988; height 170 cm; and weight 85 kgs). She lives with her partner and two children in a tier-1 city, and her state’s State ID is R1011. She was found to be nondiabetic (HbA1c = 5.8). She smokes but is otherwise healthy. She has had no transplants or major surgeries. Her father died of lung cancer. Hospitalization costs will be estimated using tier-1 hospitals.¶

4. Find the predicted hospitalization cost using the best models¶

In [203]:
model_data.columns
Out[203]:
Index(['children', 'bmi', 'hba1c', 'numberofmajorsurgeries', 'city_tier_ord',
       'hospital_tier_ord', 'state_id_r1013', 'state_id_r1011',
       'state_id_r1012', 'age', 'heart_issues_yes', 'any_transplants_yes',
       'cancer_history_yes', 'smoker_yes', 'gender_male', 'charges'],
      dtype='object')
In [215]:
pred_data = pd.DataFrame({'Name' : ['Christopher, Ms. Jayna'],
                      'DOB' : ['12/28/1988'],
                      'city_tier' : ['tier - 1'], 'children' :[ 2],
                       'HbA1c' : [5.8], 
                       'smoker_yes' : [1],
                       'heart_issues_yes' : [0],
                       'any_transplants_yes' : [0],
                       'numberofmajorsurgeries' :[ 0],
                       'cancer_history_yes' : [1],
                       'hospital_tier' : ['tier - 1'],
                       'bmi' : [85/(1.70 **2)],
                       'state_id_R1011' : [1]
                      })
In [217]:
pred_data.columns = pred_data.columns.str.lower()   
In [219]:
pred_data['gender_male']  = 0
pred_data.loc[pred_data.name.str.split('[,.]').str[1] == 'Mr', 'gender_male'] = 1
pred_data.drop(columns = 'name', inplace = True)
pred_data
Out[219]:
dob city_tier children hba1c smoker_yes heart_issues_yes any_transplants_yes numberofmajorsurgeries cancer_history_yes hospital_tier bmi state_id_r1011 gender_male
0 12/28/1988 tier - 1 2 5.8 1 0 0 0 1 tier - 1 29.411765 1 0
In [221]:
pred_data.drop(columns = 'dob', inplace = True)
In [223]:
pred_data[['city_tier_ord', 'hospital_tier_ord']] = ordinal.transform(pred_data[['city_tier', 'hospital_tier']])
In [225]:
pred_data.drop(columns =['city_tier', 'hospital_tier'], inplace = True )
In [227]:
for col in model_data.columns:
    if col not in pred_data.columns and col != 'charges':
        pred_data[col] = 0
In [229]:
pred_data
Out[229]:
children hba1c smoker_yes heart_issues_yes any_transplants_yes numberofmajorsurgeries cancer_history_yes bmi state_id_r1011 gender_male city_tier_ord hospital_tier_ord state_id_r1013 state_id_r1012 age
0 2 5.8 1 0 0 0 1 29.411765 1 0 2.0 2.0 0 0 0
In [231]:
### Apply Gradient BOOST model for predi
model_data.columns
Out[231]:
Index(['children', 'bmi', 'hba1c', 'numberofmajorsurgeries', 'city_tier_ord',
       'hospital_tier_ord', 'state_id_r1013', 'state_id_r1011',
       'state_id_r1012', 'age', 'heart_issues_yes', 'any_transplants_yes',
       'cancer_history_yes', 'smoker_yes', 'gender_male', 'charges'],
      dtype='object')
In [233]:
pred_data=pred_data[model_data.drop(columns='charges').columns]
In [235]:
model.predict(pred_data)
Out[235]:
array([23963.49414349])
In [ ]: